{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Arcene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.data -P ./data/arcene\n",
    "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.labels -P ./data/arcene\n",
    "    \n",
    "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_valid.data -P ./data/arcene\n",
    "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/arcene_valid.labels -P ./data/arcene"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('./data/arcene/arcene_train.data',sep='\\ ',header= None)\n",
    "lab = pd.read_csv('./data/arcene/arcene_train.labels',header=None)\n",
    "\n",
    "df_train['target'] = lab[0].tolist()\n",
    "# df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_val = pd.read_csv('./data/arcene/arcene_valid.data',sep='\\ ',header= None)\n",
    "lab = pd.read_csv('./data/arcene/arcene_valid.labels',header=None)\n",
    "df_val['target'] = lab[0].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(200, 10001)\n"
     ]
    }
   ],
   "source": [
    "df = pd.concat([df_train, df_val])\n",
    "df['target'] = df['target'].map({-1:0,1:1}).tolist()\n",
    "\n",
    "print(df.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df.to_csv('./data/arcene.csv',sep=',',index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Arrhythmia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(452, 274)\n",
      "(452, 275)\n"
     ]
    }
   ],
   "source": [
    "import scipy.io\n",
    "matdata = scipy.io.loadmat('./data/arrhythmia.mat')\n",
    "train = matdata['X']\n",
    "print(train.shape)\n",
    "train  = np.append(train,matdata['y'],axis=1)\n",
    "print(train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>263</th>\n",
       "      <th>264</th>\n",
       "      <th>265</th>\n",
       "      <th>266</th>\n",
       "      <th>267</th>\n",
       "      <th>270</th>\n",
       "      <th>271</th>\n",
       "      <th>272</th>\n",
       "      <th>273</th>\n",
       "      <th>274</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>75.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>190.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>91.0</td>\n",
       "      <td>193.0</td>\n",
       "      <td>371.0</td>\n",
       "      <td>174.0</td>\n",
       "      <td>121.0</td>\n",
       "      <td>-16.0</td>\n",
       "      <td>...</td>\n",
       "      <td>62.9</td>\n",
       "      <td>-0.3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>-0.9</td>\n",
       "      <td>0.9</td>\n",
       "      <td>2.9</td>\n",
       "      <td>23.3</td>\n",
       "      <td>49.4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>56.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>165.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>174.0</td>\n",
       "      <td>401.0</td>\n",
       "      <td>149.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>...</td>\n",
       "      <td>43.4</td>\n",
       "      <td>-0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.2</td>\n",
       "      <td>2.1</td>\n",
       "      <td>20.4</td>\n",
       "      <td>38.8</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>54.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>172.0</td>\n",
       "      <td>95.0</td>\n",
       "      <td>138.0</td>\n",
       "      <td>163.0</td>\n",
       "      <td>386.0</td>\n",
       "      <td>185.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>...</td>\n",
       "      <td>48.2</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.5</td>\n",
       "      <td>-2.4</td>\n",
       "      <td>0.3</td>\n",
       "      <td>3.4</td>\n",
       "      <td>12.3</td>\n",
       "      <td>49.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>55.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>202.0</td>\n",
       "      <td>380.0</td>\n",
       "      <td>179.0</td>\n",
       "      <td>143.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12.2</td>\n",
       "      <td>-2.2</td>\n",
       "      <td>0.4</td>\n",
       "      <td>2.6</td>\n",
       "      <td>34.6</td>\n",
       "      <td>61.6</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>75.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>190.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>181.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>177.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>-16.0</td>\n",
       "      <td>...</td>\n",
       "      <td>48.9</td>\n",
       "      <td>-0.4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.1</td>\n",
       "      <td>-3.6</td>\n",
       "      <td>-0.1</td>\n",
       "      <td>3.9</td>\n",
       "      <td>25.4</td>\n",
       "      <td>62.8</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 228 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    0    1      2     3      4      5      6      7      8     9    ...   263  \\\n",
       "0  75.0  0.0  190.0  80.0   91.0  193.0  371.0  174.0  121.0 -16.0  ...  62.9   \n",
       "1  56.0  1.0  165.0  64.0   81.0  174.0  401.0  149.0   39.0  25.0  ...  43.4   \n",
       "2  54.0  0.0  172.0  95.0  138.0  163.0  386.0  185.0  102.0  96.0  ...  48.2   \n",
       "3  55.0  0.0  175.0  94.0  100.0  202.0  380.0  179.0  143.0  28.0  ...  68.0   \n",
       "4  75.0  0.0  190.0  80.0   88.0  181.0  360.0  177.0  103.0 -16.0  ...  48.9   \n",
       "\n",
       "   264  265   266  267  270  271   272   273  274  \n",
       "0 -0.3  0.0   9.0 -0.9  0.9  2.9  23.3  49.4  1.0  \n",
       "1 -0.5  0.0   8.5  0.0  0.2  2.1  20.4  38.8  0.0  \n",
       "2  0.9  0.0   9.5 -2.4  0.3  3.4  12.3  49.0  0.0  \n",
       "3  0.1  0.0  12.2 -2.2  0.4  2.6  34.6  61.6  0.0  \n",
       "4 -0.4  0.0  13.1 -3.6 -0.1  3.9  25.4  62.8  1.0  \n",
       "\n",
       "[5 rows x 228 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(train)\n",
    "df = df.loc[:, df.std() > 0.08]\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.drop(columns=[18])\n",
    "\n",
    "df.head()\n",
    "df.to_csv('./data/arrhythmia.csv',sep=',',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>217</th>\n",
       "      <th>218</th>\n",
       "      <th>219</th>\n",
       "      <th>220</th>\n",
       "      <th>221</th>\n",
       "      <th>222</th>\n",
       "      <th>223</th>\n",
       "      <th>224</th>\n",
       "      <th>225</th>\n",
       "      <th>226</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>75.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>190.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>91.0</td>\n",
       "      <td>193.0</td>\n",
       "      <td>371.0</td>\n",
       "      <td>174.0</td>\n",
       "      <td>121.0</td>\n",
       "      <td>-16.0</td>\n",
       "      <td>...</td>\n",
       "      <td>62.9</td>\n",
       "      <td>-0.3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>-0.9</td>\n",
       "      <td>0.9</td>\n",
       "      <td>2.9</td>\n",
       "      <td>23.3</td>\n",
       "      <td>49.4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>56.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>165.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>174.0</td>\n",
       "      <td>401.0</td>\n",
       "      <td>149.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>...</td>\n",
       "      <td>43.4</td>\n",
       "      <td>-0.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.2</td>\n",
       "      <td>2.1</td>\n",
       "      <td>20.4</td>\n",
       "      <td>38.8</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>54.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>172.0</td>\n",
       "      <td>95.0</td>\n",
       "      <td>138.0</td>\n",
       "      <td>163.0</td>\n",
       "      <td>386.0</td>\n",
       "      <td>185.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>...</td>\n",
       "      <td>48.2</td>\n",
       "      <td>0.9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.5</td>\n",
       "      <td>-2.4</td>\n",
       "      <td>0.3</td>\n",
       "      <td>3.4</td>\n",
       "      <td>12.3</td>\n",
       "      <td>49.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>55.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>94.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>202.0</td>\n",
       "      <td>380.0</td>\n",
       "      <td>179.0</td>\n",
       "      <td>143.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>...</td>\n",
       "      <td>68.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12.2</td>\n",
       "      <td>-2.2</td>\n",
       "      <td>0.4</td>\n",
       "      <td>2.6</td>\n",
       "      <td>34.6</td>\n",
       "      <td>61.6</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>75.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>190.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>181.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>177.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>-16.0</td>\n",
       "      <td>...</td>\n",
       "      <td>48.9</td>\n",
       "      <td>-0.4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.1</td>\n",
       "      <td>-3.6</td>\n",
       "      <td>-0.1</td>\n",
       "      <td>3.9</td>\n",
       "      <td>25.4</td>\n",
       "      <td>62.8</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 227 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    0    1      2     3      4      5      6      7      8     9    ...   217  \\\n",
       "0  75.0  0.0  190.0  80.0   91.0  193.0  371.0  174.0  121.0 -16.0  ...  62.9   \n",
       "1  56.0  1.0  165.0  64.0   81.0  174.0  401.0  149.0   39.0  25.0  ...  43.4   \n",
       "2  54.0  0.0  172.0  95.0  138.0  163.0  386.0  185.0  102.0  96.0  ...  48.2   \n",
       "3  55.0  0.0  175.0  94.0  100.0  202.0  380.0  179.0  143.0  28.0  ...  68.0   \n",
       "4  75.0  0.0  190.0  80.0   88.0  181.0  360.0  177.0  103.0 -16.0  ...  48.9   \n",
       "\n",
       "   218  219   220  221  222  223   224   225  226  \n",
       "0 -0.3  0.0   9.0 -0.9  0.9  2.9  23.3  49.4  1.0  \n",
       "1 -0.5  0.0   8.5  0.0  0.2  2.1  20.4  38.8  0.0  \n",
       "2  0.9  0.0   9.5 -2.4  0.3  3.4  12.3  49.0  0.0  \n",
       "3  0.1  0.0  12.2 -2.2  0.4  2.6  34.6  61.6  0.0  \n",
       "4 -0.4  0.0  13.1 -3.6 -0.1  3.9  25.4  62.8  1.0  \n",
       "\n",
       "[5 rows x 227 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/arrhythmia.csv',header=None,skiprows=1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ForestCover"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2021-05-05 13:44:56--  https://kdd.ics.uci.edu/databases/covertype/covtype.data.gz\n",
      "Resolving kdd.ics.uci.edu... 128.195.1.86\n",
      "Connecting to kdd.ics.uci.edu|128.195.1.86|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 11240707 (11M) [application/x-gzip]\n",
      "Saving to: ‘./data/forest/covtype.data.gz’\n",
      "\n",
      "covtype.data.gz     100%[===================>]  10.72M  7.59MB/s    in 1.4s    \n",
      "\n",
      "2021-05-05 13:44:58 (7.59 MB/s) - ‘./data/forest/covtype.data.gz’ saved [11240707/11240707]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://kdd.ics.uci.edu/databases/covertype/covtype.data.gz -P ./data/forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(581012, 55)\n",
      "(495141, 55)\n",
      "(495141, 50)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>44</th>\n",
       "      <th>45</th>\n",
       "      <th>46</th>\n",
       "      <th>47</th>\n",
       "      <th>48</th>\n",
       "      <th>49</th>\n",
       "      <th>51</th>\n",
       "      <th>52</th>\n",
       "      <th>53</th>\n",
       "      <th>54</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2579</td>\n",
       "      <td>132</td>\n",
       "      <td>6</td>\n",
       "      <td>300</td>\n",
       "      <td>-15</td>\n",
       "      <td>67</td>\n",
       "      <td>230</td>\n",
       "      <td>237</td>\n",
       "      <td>140</td>\n",
       "      <td>6031</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2886</td>\n",
       "      <td>151</td>\n",
       "      <td>11</td>\n",
       "      <td>371</td>\n",
       "      <td>26</td>\n",
       "      <td>5253</td>\n",
       "      <td>234</td>\n",
       "      <td>240</td>\n",
       "      <td>136</td>\n",
       "      <td>4051</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2742</td>\n",
       "      <td>134</td>\n",
       "      <td>22</td>\n",
       "      <td>150</td>\n",
       "      <td>69</td>\n",
       "      <td>3215</td>\n",
       "      <td>248</td>\n",
       "      <td>224</td>\n",
       "      <td>92</td>\n",
       "      <td>6091</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      0    1   2    3    4     5    6    7    8     9   ...  44  45  46  47  \\\n",
       "2   2804  139   9  268   65  3180  234  238  135  6121  ...   0   0   0   0   \n",
       "3   2785  155  18  242  118  3090  238  238  122  6211  ...   0   0   0   0   \n",
       "5   2579  132   6  300  -15    67  230  237  140  6031  ...   0   0   0   0   \n",
       "11  2886  151  11  371   26  5253  234  240  136  4051  ...   0   0   0   0   \n",
       "12  2742  134  22  150   69  3215  248  224   92  6091  ...   0   0   0   0   \n",
       "\n",
       "    48  49  51  52  53  54  \n",
       "2    0   0   0   0   0   1  \n",
       "3    0   0   0   0   0   1  \n",
       "5    0   0   0   0   0   1  \n",
       "11   0   0   0   0   0   1  \n",
       "12   0   0   0   0   0   1  \n",
       "\n",
       "[5 rows x 50 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/forest/covtype.data',header=None)\n",
    "print(df.shape)\n",
    "df = df[df[54]<=2]\n",
    "df[54] = df[54] - 1\n",
    "print(df.shape)\n",
    "df = df.loc[:, df.std() > 0]\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('./data/forest.csv',sep=',',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>40</th>\n",
       "      <th>41</th>\n",
       "      <th>42</th>\n",
       "      <th>43</th>\n",
       "      <th>44</th>\n",
       "      <th>45</th>\n",
       "      <th>46</th>\n",
       "      <th>47</th>\n",
       "      <th>48</th>\n",
       "      <th>49</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2579</td>\n",
       "      <td>132</td>\n",
       "      <td>6</td>\n",
       "      <td>300</td>\n",
       "      <td>-15</td>\n",
       "      <td>67</td>\n",
       "      <td>230</td>\n",
       "      <td>237</td>\n",
       "      <td>140</td>\n",
       "      <td>6031</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2886</td>\n",
       "      <td>151</td>\n",
       "      <td>11</td>\n",
       "      <td>371</td>\n",
       "      <td>26</td>\n",
       "      <td>5253</td>\n",
       "      <td>234</td>\n",
       "      <td>240</td>\n",
       "      <td>136</td>\n",
       "      <td>4051</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2742</td>\n",
       "      <td>134</td>\n",
       "      <td>22</td>\n",
       "      <td>150</td>\n",
       "      <td>69</td>\n",
       "      <td>3215</td>\n",
       "      <td>248</td>\n",
       "      <td>224</td>\n",
       "      <td>92</td>\n",
       "      <td>6091</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 50 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1   2    3    4     5    6    7    8     9   ...  40  41  42  43  \\\n",
       "0  2804  139   9  268   65  3180  234  238  135  6121  ...   0   0   0   0   \n",
       "1  2785  155  18  242  118  3090  238  238  122  6211  ...   0   0   0   0   \n",
       "2  2579  132   6  300  -15    67  230  237  140  6031  ...   0   0   0   0   \n",
       "3  2886  151  11  371   26  5253  234  240  136  4051  ...   0   0   0   0   \n",
       "4  2742  134  22  150   69  3215  248  224   92  6091  ...   0   0   0   0   \n",
       "\n",
       "   44  45  46  47  48  49  \n",
       "0   0   0   0   0   0   1  \n",
       "1   0   0   0   0   0   1  \n",
       "2   0   0   0   0   0   1  \n",
       "3   0   0   0   0   0   1  \n",
       "4   0   0   0   0   0   1  \n",
       "\n",
       "[5 rows x 50 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/forest.csv',header=None,skiprows=1)\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KDD 99"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2021-05-05 13:55:59--  http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\n",
      "Resolving kdd.ics.uci.edu... 128.195.1.86\n",
      "Connecting to kdd.ics.uci.edu|128.195.1.86|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 2144903 (2.0M) [application/x-gzip]\n",
      "Saving to: ‘./data/kdd99/kddcup.data_10_percent.gz’\n",
      "\n",
      "kddcup.data_10_perc 100%[===================>]   2.04M  1.73MB/s    in 1.2s    \n",
      "\n",
      "2021-05-05 13:56:00 (1.73 MB/s) - ‘./data/kdd99/kddcup.data_10_percent.gz’ saved [2144903/2144903]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz -P ./data/kdd99"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(494021, 42)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>32</th>\n",
       "      <th>33</th>\n",
       "      <th>34</th>\n",
       "      <th>35</th>\n",
       "      <th>36</th>\n",
       "      <th>37</th>\n",
       "      <th>38</th>\n",
       "      <th>39</th>\n",
       "      <th>40</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>181</td>\n",
       "      <td>5450</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>9</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>239</td>\n",
       "      <td>486</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>19</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>235</td>\n",
       "      <td>1337</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>29</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>219</td>\n",
       "      <td>1337</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>39</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>217</td>\n",
       "      <td>2032</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>49</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0    1     2   3    4     5  6  7  8  9  ...  32   33   34    35   36   37  \\\n",
       "0  0  tcp  http  SF  181  5450  0  0  0  0  ...   9  1.0  0.0  0.11  0.0  0.0   \n",
       "1  0  tcp  http  SF  239   486  0  0  0  0  ...  19  1.0  0.0  0.05  0.0  0.0   \n",
       "2  0  tcp  http  SF  235  1337  0  0  0  0  ...  29  1.0  0.0  0.03  0.0  0.0   \n",
       "3  0  tcp  http  SF  219  1337  0  0  0  0  ...  39  1.0  0.0  0.03  0.0  0.0   \n",
       "4  0  tcp  http  SF  217  2032  0  0  0  0  ...  49  1.0  0.0  0.02  0.0  0.0   \n",
       "\n",
       "    38   39   40  target  \n",
       "0  0.0  0.0  0.0       1  \n",
       "1  0.0  0.0  0.0       1  \n",
       "2  0.0  0.0  0.0       1  \n",
       "3  0.0  0.0  0.0       1  \n",
       "4  0.0  0.0  0.0       1  \n",
       "\n",
       "[5 rows x 42 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/kdd99/kddcup.data_10_percent',header=None)\n",
    "df = df.assign(target = lambda x: x[41]=='normal.')\n",
    "df['target'] = df['target'].astype(int)\n",
    "df = df.drop(columns=[41])\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "drop_cols = []\n",
    "for col in df.columns:\n",
    "    if df[col].dtypes != 'object' and df[col].std()<=0:\n",
    "        drop_cols.append(col)\n",
    "        \n",
    "df = df.drop(columns = drop_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('./data/kdd99.csv',sep=',',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>30</th>\n",
       "      <th>31</th>\n",
       "      <th>32</th>\n",
       "      <th>33</th>\n",
       "      <th>34</th>\n",
       "      <th>35</th>\n",
       "      <th>36</th>\n",
       "      <th>37</th>\n",
       "      <th>38</th>\n",
       "      <th>39</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>181</td>\n",
       "      <td>5450</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>9</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>239</td>\n",
       "      <td>486</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>19</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>235</td>\n",
       "      <td>1337</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>29</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>219</td>\n",
       "      <td>1337</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>39</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>217</td>\n",
       "      <td>2032</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>49</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0    1     2   3    4     5   6   7   8   9   ...  30   31   32    33   34  \\\n",
       "0   0  tcp  http  SF  181  5450   0   0   0   0  ...   9  1.0  0.0  0.11  0.0   \n",
       "1   0  tcp  http  SF  239   486   0   0   0   0  ...  19  1.0  0.0  0.05  0.0   \n",
       "2   0  tcp  http  SF  235  1337   0   0   0   0  ...  29  1.0  0.0  0.03  0.0   \n",
       "3   0  tcp  http  SF  219  1337   0   0   0   0  ...  39  1.0  0.0  0.03  0.0   \n",
       "4   0  tcp  http  SF  217  2032   0   0   0   0  ...  49  1.0  0.0  0.02  0.0   \n",
       "\n",
       "    35   36   37   38  39  \n",
       "0  0.0  0.0  0.0  0.0   1  \n",
       "1  0.0  0.0  0.0  0.0   1  \n",
       "2  0.0  0.0  0.0  0.0   1  \n",
       "3  0.0  0.0  0.0  0.0   1  \n",
       "4  0.0  0.0  0.0  0.0   1  \n",
       "\n",
       "[5 rows x 40 columns]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/kdd99.csv',header=None,skiprows=1)\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
